# Title:   NBA Fan Engagement
# Purpose: Analyze trends in tweets, Oct 2019 - Oct 2020
# Author:  Alonso Gallegos
# Email:   cgallegos2109@student.hult.edu
# License: GPL (>= 3)
# Date:    Jan 15, 2021
# Set the working directory
# NOTE(review): a machine-specific absolute path makes the script non-portable;
# consider relative paths or the here package if this is shared.
setwd("~/Documents/MBAN/R/hult_NLP_student/cases/NBA Fan Engagement/data")
# Libs
# Text mining & cleaning: tm, qdap; plotting: plotrix, ggplot2, ggthemes, ggalt;
# clouds: wordcloud, wordcloud2; colors: RColorBrewer; progress bars: pbapply.
# The '##' lines below are the package-attach messages captured when knitting.
library(tm)
## Loading required package: NLP
library(qdap)
## Loading required package: qdapDictionaries
## Loading required package: qdapRegex
## Loading required package: qdapTools
## Loading required package: RColorBrewer
##
## Attaching package: 'qdap'
## The following objects are masked from 'package:tm':
##
## as.DocumentTermMatrix, as.TermDocumentMatrix
## The following object is masked from 'package:NLP':
##
## ngrams
## The following objects are masked from 'package:base':
##
## Filter, proportions
library(plotrix)
library(ggplot2)
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:qdapRegex':
##
## %+%
## The following object is masked from 'package:NLP':
##
## annotate
library(ggthemes)
library(ggalt)
## Registered S3 methods overwritten by 'ggalt':
## method from
## grid.draw.absoluteGrob ggplot2
## grobHeight.absoluteGrob ggplot2
## grobWidth.absoluteGrob ggplot2
## grobX.absoluteGrob ggplot2
## grobY.absoluteGrob ggplot2
library(wordcloud)
library(RColorBrewer)
library(pbapply)
library(wordcloud2)
# Data ----
# Read one CSV of raw tweets per month (Oct 2019 through Oct 2020) into
# text, text1, ..., text12 — the same objects the rest of the script expects.
monthFiles <- c('A_Oct2019.csv', 'B_Nov2019.csv', 'C_Dec2019.csv',
                'D_Jan2020.csv', 'E_Feb2020.csv', 'F_Mar2020.csv',
                'G_Apr2020.csv', 'H_May2020.csv', 'I_June2020.csv',
                'J_July2020.csv', 'K_Aug2020.csv', 'L_Sep2020.csv',
                'M_Oct2020.csv')
textNames <- c('text', paste0('text', 1:12))
for (i in seq_along(monthFiles)) {
  assign(textNames[i], read.csv(monthFiles[i], header = TRUE))
}
# Sampling ----
# Draw a reproducible 1,000-tweet sample from each monthly data set.
# Each month keeps the exact seed used in the original script, and the
# index vector is built the same way, so the RNG draws (and therefore the
# sampled rows) are identical to earlier runs. This replaces 53 lines of
# copy-paste and avoids leaving 13 throwaway idx* objects in the workspace.
monthSamples <- list(
  sampleOct   = list(df = text,   seed = 1234),
  sampleNov   = list(df = text1,  seed = 4567),
  sampleDec   = list(df = text2,  seed = 5678),
  sampleJan   = list(df = text3,  seed = 2345),
  sampleFeb   = list(df = text4,  seed = 3456),
  sampleMar   = list(df = text5,  seed = 6789),
  sampleApr   = list(df = text6,  seed = 7891),
  sampleMay   = list(df = text7,  seed = 8912),
  sampleJune  = list(df = text8,  seed = 9123),
  sampleJuly  = list(df = text9,  seed = 9876),
  sampleAug   = list(df = text10, seed = 8765),
  sampleSept  = list(df = text11, seed = 7654),
  sampleOct20 = list(df = text12, seed = 6543)
)
for (nm in names(monthSamples)) {
  m   <- monthSamples[[nm]]
  idx <- 1:nrow(m$df)        # same index construction as the original code
  set.seed(m$seed)           # per-month seed preserves the original samples
  assign(nm, m$df[sample(idx, 1000), ])
}
# Saving new file ----
# BUG FIX: the last write.csv previously wrote `sampleSept` a second time,
# so 'Sample_Oct_20.csv' silently contained September data; it now writes
# sampleOct20. Also fixed the 'Sample_Noc_19.csv' filename typo to
# 'Sample_Nov_19.csv' consistently on both the write and the read.
sampleObjects <- c(
  Sample_Oct_19  = 'sampleOct',  Sample_Nov_19  = 'sampleNov',
  Sample_Dec_19  = 'sampleDec',  Sample_Jan_20  = 'sampleJan',
  Sample_Feb_20  = 'sampleFeb',  Sample_Mar_20  = 'sampleMar',
  Sample_Apr_20  = 'sampleApr',  Sample_May_20  = 'sampleMay',
  Sample_June_20 = 'sampleJune', Sample_July_20 = 'sampleJuly',
  Sample_Aug_20  = 'sampleAug',  Sample_Sept_20 = 'sampleSept',
  Sample_Oct_20  = 'sampleOct20'
)
for (stem in names(sampleObjects)) {
  write.csv(get(sampleObjects[[stem]]), paste0(stem, '.csv'), row.names = FALSE)
}
# Reading new sample tweets ----
# Re-read from disk so later runs can start from the fixed samples.
S_Oct19  <- read.csv('Sample_Oct_19.csv',  header = TRUE)
S_Nov19  <- read.csv('Sample_Nov_19.csv',  header = TRUE)
S_Dec19  <- read.csv('Sample_Dec_19.csv',  header = TRUE)
S_Jan20  <- read.csv('Sample_Jan_20.csv',  header = TRUE)
S_Feb20  <- read.csv('Sample_Feb_20.csv',  header = TRUE)
S_Mar20  <- read.csv('Sample_Mar_20.csv',  header = TRUE)
S_Apr20  <- read.csv('Sample_Apr_20.csv',  header = TRUE)
S_May20  <- read.csv('Sample_May_20.csv',  header = TRUE)
S_June20 <- read.csv('Sample_June_20.csv', header = TRUE)
S_July20 <- read.csv('Sample_July_20.csv', header = TRUE)
S_Aug20  <- read.csv('Sample_Aug_20.csv',  header = TRUE)
S_Sept20 <- read.csv('Sample_Sept_20.csv', header = TRUE)
S_Oct20  <- read.csv('Sample_Oct_20.csv',  header = TRUE)
# Options & Functions
options(stringsAsFactors = FALSE)  # keep character columns as strings (pre-R-4.0 default was factors)
Sys.setlocale('LC_ALL','C')  # C locale avoids multibyte-string errors in tm/qdap on tweet text
## [1] "C/C/C/C/C/en_US.UTF-8"
tryTolower <- function(x){
  # Lowercase x, returning NA instead of erroring when tolower() fails
  # (e.g. on invalid multibyte characters that appear in raw tweets).
  tryCatch(tolower(x), error = function(e) NA)
}
cleanCorpus <- function(corpus, customStopwords){
  # Standard tm cleaning pipeline: strip URLs, numbers, punctuation and
  # extra whitespace, lowercase safely, then remove the custom stop words.
  # Order matters — URLs must be removed before punctuation is stripped.
  steps <- list(
    content_transformer(qdapRegex::rm_url),
    # content_transformer(replace_contraction),  # kept disabled, as before
    removeNumbers,
    removePunctuation,
    stripWhitespace,
    content_transformer(tryTolower)
  )
  for (f in steps) corpus <- tm_map(corpus, f)
  tm_map(corpus, removeWords, customStopwords)
}
# Create custom stop words
# Generic high-frequency tweet terms plus every NBA team/city name, so that
# routine team mentions do not dominate the word clouds.
# NOTE(review): 'game' appears twice and 'and' is already in
# stopwords('english') — harmless duplicates for removeWords, but redundant.
stops <- c(stopwords('english'),'win','team','season','game','la','game','httpstco','nba','basketball','rt', 'lol', 'amp', 'and','vs','atlanta', 'hawks','boston','celtics','brooklyn','nets','charlotte','hornets','chicago', 'bulls','cleveland','cavaliers','dallas', 'mavericks','denver','nuggets','detroit', 'pistons','golden', 'state', 'warriors','houston','rockets','indiana','pacers','los','angeles', 'clippers',
'lakers','memphis', 'grizzlies','miami', 'heat','milwaukee', 'bucks','minnesota', 'timberwolves',
'orleans', 'pelicans','new','york','knicks','oklahoma', 'thunder','orlando', 'magic','philadelphia', '76ers',
'phoenix', 'suns','portland', 'trail','blazers','sacramento', 'kings','san','antonio', 'spurs','toronto', 'raptors','utah', 'jazz','washington', 'wizards')
# Variant WITHOUT team names, used later to see which teams trend each month
stops_team <-c(stopwords('english'),'win','team','season','la','game','nba','basketball','rt', 'lol', 'amp', 'and','game','vs','new','httpstco')
# Create bigram tokens
bigramTokens <- function(x){
  # Tokenizer for TermDocumentMatrix: each pair of consecutive words
  # becomes a single space-separated token ("word1 word2").
  pairs <- NLP::ngrams(words(x), 2)
  unlist(lapply(pairs, function(p) paste(p, collapse = " ")),
         use.names = FALSE)
}
# As of tm version 0.7-3 tabular was deprecated
# DataframeSource() requires the first column to be named 'doc_id'.
for (nm in c('S_Oct19', 'S_Nov19', 'S_Dec19', 'S_Jan20', 'S_Feb20',
             'S_Mar20', 'S_Apr20', 'S_May20', 'S_June20', 'S_July20',
             'S_Aug20', 'S_Sept20', 'S_Oct20')) {
  tmp <- get(nm)
  names(tmp)[1] <- 'doc_id'
  assign(nm, tmp)
}
# Make a volatile corpus
# Build one VCorpus per monthly sample (txtCorpusOct ... txtCorpusOct20).
monthToSample <- c(Oct   = 'S_Oct19',  Nov  = 'S_Nov19',  Dec   = 'S_Dec19',
                   Jan   = 'S_Jan20',  Feb  = 'S_Feb20',  Mar   = 'S_Mar20',
                   Apr   = 'S_Apr20',  May  = 'S_May20',  June  = 'S_June20',
                   July  = 'S_July20', Aug  = 'S_Aug20',  Sept  = 'S_Sept20',
                   Oct20 = 'S_Oct20')
for (k in names(monthToSample)) {
  assign(paste0('txtCorpus', k),
         VCorpus(DataframeSource(get(monthToSample[[k]]))))
}
# Preprocess the corpus
# Apply the shared cleaning pipeline (with the team-name stop list) in place.
for (k in c('Oct', 'Nov', 'Dec', 'Jan', 'Feb', 'Mar', 'Apr',
            'May', 'June', 'July', 'Aug', 'Sept', 'Oct20')) {
  corpusName <- paste0('txtCorpus', k)
  assign(corpusName, cleanCorpus(get(corpusName), stops))
}
# Create TDM, Monogram ----
# BUG FIX: Oct20TDM was previously built from txtCorpusJuly, so the
# "October 2020" term-document matrix actually contained July data.
# It now uses txtCorpusOct20. The repetitive 26 lines are folded into a
# loop that creates <label>TDM and <label>TDMm for each month, exactly
# as before.
monthCorpora <- list(
  Oct19  = txtCorpusOct,  Nov19  = txtCorpusNov,  Dec19  = txtCorpusDec,
  Jan20  = txtCorpusJan,  Feb20  = txtCorpusFeb,  Mar20  = txtCorpusMar,
  Apr20  = txtCorpusApr,  May20  = txtCorpusMay,  June20 = txtCorpusJune,
  July20 = txtCorpusJuly, Aug20  = txtCorpusAug,  Sept20 = txtCorpusSept,
  Oct20  = txtCorpusOct20
)
for (lbl in names(monthCorpora)) {
  tdm <- TermDocumentMatrix(monthCorpora[[lbl]])
  assign(paste0(lbl, 'TDM'),  tdm)             # e.g. Oct19TDM
  assign(paste0(lbl, 'TDMm'), as.matrix(tdm))  # e.g. Oct19TDMm
}
# Make bi-gram TDM according to the tokenize control & convert it to matrix
biMonths <- c(Oct19 = 'txtCorpusOct', Nov19 = 'txtCorpusNov', Dec19 = 'txtCorpusDec')
for (m in names(biMonths)) {
  tdm <- TermDocumentMatrix(get(biMonths[[m]]),
                            control = list(tokenize = bigramTokens))
  assign(paste0('Bi', m, 'TDM'),  tdm)
  assign(paste0('Bi', m, 'TDMm'), as.matrix(tdm))
}
# Get Row Sums & organize
# For each month, sum term frequencies across documents, sort descending,
# and build a word/freq data frame for the word-cloud functions.
# Note: the Bi*TDM names are reused to hold the sorted frequency vectors,
# mirroring the original script's behaviour.
for (m in c('Oct19', 'Nov19', 'Dec19')) {
  freqs <- sort(rowSums(get(paste0('Bi', m, 'TDMm'))), decreasing = TRUE)
  assign(paste0('Bi', m, 'TDM'), freqs)
  assign(paste0(m, 'DF'), data.frame(word = names(freqs), freq = freqs))
}
# Review all Palettes
display.brewer.all()
# Choose a color & drop light ones
pal <- brewer.pal(8, "Purples")
pal <- pal[-(1:2)]  # drop the two lightest shades so words stay readable
# Make simple word cloud
# Reminder to expand device pane
set.seed(1234)
wordcloud(Dec19DF$word,
Dec19DF$freq,
max.words = 150,
random.order = FALSE,
colors = pal,
scale = c(2,0.5))
# Dynamic (interactive HTML) word cloud via wordcloud2, top 50 bigrams
wordcloud2(Dec19DF[1:50,],
shape = "cardioid",
color = "blue",
backgroundColor = "pink")
wordcloud2(data = Nov19DF[1:50,])
# NOTE(review): help-page call left in from interactive exploration
?wordcloud2
# Frequency Data Frame
# Total frequency of each term across the October 2019 sample tweets.
tweetSums <- rowSums(Oct19TDMm)
tweetFreq <- data.frame(word=names(tweetSums ),frequency=tweetSums )
# Review a section
tweetFreq[50:55,]
## word frequency
## agree agree 1
## agreed agreed 17
## agrees agrees 1
## ahead ahead 2
## ahmadmonk ahmadmonk 1
## ainge ainge 2
# Remove the row attributes meta family
# (drop the term names used as row names so rows get plain integer indices)
rownames(tweetFreq) <- NULL
tweetFreq[50:55,]
## word frequency
## 50 agree 1
## 51 agreed 17
## 52 agrees 1
## 53 ahead 2
## 54 ahmadmonk 1
## 55 ainge 2
# Simple barplot; keep words with frequency >= 10
# (comment previously said "greater than 15", which did not match the code)
topWords <- subset(tweetFreq, tweetFreq$frequency >= 10)
topWords <- topWords[order(topWords$frequency, decreasing=F),]
# Chg to factor for ggplot
# Fixing the factor levels in ascending-frequency order makes coord_flip()
# show the most frequent words at the top of the chart.
topWords$word <- factor(topWords$word,
levels=unique(as.character(topWords$word)))
ggplot(topWords, aes(x=word, y=frequency)) +
geom_bar(stat="identity", fill='darkred') +
coord_flip()+ theme_gdocs() +
geom_text(aes(label=frequency), colour="white",hjust=1.25, size=3.0)
# qdap version, slightly different results based on params but faster
# freq_terms works on the raw text column directly (no corpus needed).
plot(freq_terms(S_Oct19$text, top=50, at.least=2, stopwords = stops))
plot(freq_terms(S_Nov19$text, top=50, at.least=2, stopwords = stops))
plot(freq_terms(S_Dec19$text, top=50, at.least=2, stopwords = stops))
plot(freq_terms(S_Jan20$text, top=50, at.least=2, stopwords = stops))
# Not using team names as stops — shows which teams trend in each month
plot(freq_terms(S_Oct19$text, top=50, at.least=2, stopwords = stops_team))
plot(freq_terms(S_Nov19$text, top=50, at.least=2, stopwords = stops_team))
plot(freq_terms(S_Dec19$text, top=50, at.least=2, stopwords = stops_team))
plot(freq_terms(S_Jan20$text, top=50, at.least=2, stopwords = stops_team))
############ Back to PPT
# Inspect word associations
# Terms correlated with 'kawhi' in the Dec 2019 TDM at r >= 0.21.
associations <- findAssocs(Dec19TDM, 'kawhi', 0.21)
associations
## $kawhi
## leonard youre set
## 0.74 0.47 0.43
## \360\237\221\217 lebron george\342\200\246
## 0.38 0.36 0.33
## leonard\342\200\231s take anthony
## 0.33 0.30 0.28
## davis paul anthonys
## 0.28 0.28 0.27
## ends ge\342\200\246 hoopsnbrews
## 0.27 0.27 0.27
## leonards outplayed pound
## 0.27 0.27 0.27
## battle clippernation early
## 0.26 0.22 0.22
## fts realskipbayless
## 0.22 0.22
# Organize the word associations
# Flatten the named correlation vector into a two-column data frame and
# freeze the term order (already sorted by correlation) for plotting.
assocDF <- data.frame(terms=names(associations[[1]]),
value=unlist(associations))
assocDF$terms <- factor(assocDF$terms, levels=assocDF$terms)
rownames(assocDF) <- NULL
assocDF
## terms value
## 1 leonard 0.74
## 2 youre 0.47
## 3 set 0.43
## 4 \360\237\221\217 0.38
## 5 lebron 0.36
## 6 george\342\200\246 0.33
## 7 leonard\342\200\231s 0.33
## 8 take 0.30
## 9 anthony 0.28
## 10 davis 0.28
## 11 paul 0.28
## 12 anthonys 0.27
## 13 ends 0.27
## 14 ge\342\200\246 0.27
## 15 hoopsnbrews 0.27
## 16 leonards 0.27
## 17 outplayed 0.27
## 18 pound 0.27
## 19 battle 0.26
## 20 clippernation 0.22
## 21 early 0.22
## 22 fts 0.22
## 23 realskipbayless 0.22
# Make a dot plot of the correlation of each term with 'kawhi'
ggplot(assocDF, aes(y=terms)) +
geom_point(aes(x=value), data=assocDF, col='#c00c00') +
theme_gdocs() +
geom_text(aes(x=value,label=value), colour="red",hjust="inward", vjust ="inward" , size=3)
# Cluster data
# Reduce TDM
reducedTDM <- removeSparseTerms(Dec19TDM, sparse=0.985) #shoot for ~50 terms; 1.5% of cells in row have a value
reducedTDM
## <<TermDocumentMatrix (terms: 57, documents: 1000)>>
## Non-/sparse entries: 1366/55634
## Sparsity : 98%
## Maximal term length: 13
## Weighting : term frequency (tf)
# Organize the smaller TDM
reducedTDM <- as.data.frame(as.matrix(reducedTDM))
# Basic Hierarchical Clustering on distances between term rows
hc <- hclust(dist(reducedTDM))
plot(hc,yaxt='n')  # yaxt='n' suppresses the dendrogram height axis
# WORD ASSOCIATION
# Find Oct 2019 tweets mentioning 'nike' and visualise co-occurring terms.
assocText <- rm_url(S_Oct19$text)
# MORE QDAP!
word_associate(assocText,
match.string = 'nike',
stopwords = stops,
network.plot = T,
wordcloud =T,
cloud.colors = c('black','darkred'))
## Warning in text2color(words = V(g)$label, recode.words = target.words, colors =
## label.colors): length of colors should be 1 more than length of recode.words
## row group unit text
## 1 103 all 103 RT @dbongino: Wait, I thought Nike was \342\200\234woke and all???? \360\237\221\207\360\237\217\273 \342\200\234Nike Pulls Houston Rockets Merchandise from Chinese Stores\342\200\235
## 2 397 all 397 227's\342\204\242 YouTube Chili' #NIKE'Spicy' #NBA Mix! #Rockets
## 3 484 all 484 RT @sportslogosnet: LEAK: Indiana #Pacers new yellow ""Statement"" edition uniforms, contains elements of the 1990s FloJo look #NBA #Nike P\342\200\246
## 4 513 all 513 RT @CBSNews: Mike Pence has accused Nike of ""checking its social conscience at the door"" for removing Houston Rockets merchandise from its\342\200\246
## 5 631 all 631 RT @FBillMcMorris: LeBron James, who has a $1 billion shoe deal with Nike, says pro-Hong Kong NBA exec needs to think more about others* h\342\200\246
## 6 757 all 757 LeBron's signature Nike shoes were well represented at Cleveland Cavaliers at media day
## 7 887 all 887 RT @thehill: VP Mike Pence: ""Nike stores in China actually removed their Houston Rockets merchandise from their shelves to join the Chinese\342\200\246
## 8 893 all 893 RT @ScottHendrick49: Golden State Coach Steve Kerr, you're an idiot when it comes to politics. How much did it cost Nike to buy you off? Ch\342\200\246
## 9 983 all 983 RT @CBSNews: Mike Pence has accused Nike of ""checking its social conscience at the door"" for removing Houston Rockets merchandise from its\342\200\246
## 10 991 all 991 RT @dbongino: Wait, I thought Nike was \342\200\234woke and all???? \360\237\221\207\360\237\217\273 \342\200\234Nike Pulls Houston Rockets Merchandise from Chinese Stores\342\200\235
##
## Match Terms
## ===========
##
## List 1:
## nike, <e2><80><9c>nike, nike'spicy'
##
# CLEANING TWEETS TO COMBINE
october <- VCorpus(VectorSource(S_Oct19$text))
november <- VCorpus(VectorSource(S_Nov19$text))
december <- VCorpus(VectorSource(S_Dec19$text))
october<- cleanCorpus(october,stops)
november<- cleanCorpus(november,stops)
december<- cleanCorpus(december,stops)
# Extract the cleaned text from each corpus (pblapply shows a progress bar)
october <- unlist(pblapply(october, content))
november<- unlist(pblapply(november, content))
december<- unlist(pblapply(december, content))
# Collapse each month's 1,000 tweets into one long document string
october <- paste(october, collapse = ' ')
november <- paste(november, collapse = ' ')
december <- paste(december, collapse = ' ')
# Combine the subject documents into a corpus of three documents
# (one document per month: Oct, Nov, Dec 2019)
allmonths <- c(october,november,december)
allmonths <- VCorpus((VectorSource(allmonths)))
# Make TDM with a different control parameter
# Tokenization `control=list(tokenize=bigramTokens)`
# You can have more than 1 ie `control=list(tokenize=bigramTokens, weighting = weightTfIdf)`
ctrl <- list(weighting = weightTfIdf)  # TF-IDF down-weights terms common to all months
#ctrl<- list( wordLengths=c(0,Inf))
allmonthsTDM <- TermDocumentMatrix(allmonths, control = ctrl)
allmonthsTDMm <- as.matrix(allmonthsTDM)
# Plain term-frequency TDM for the commonality cloud
allmonthsTDMC <- TermDocumentMatrix(allmonths)
allmonthsTDMmC<- as.matrix(allmonthsTDMC)
# Column order must match the c(october, november, december) order above
colnames(allmonthsTDMm) <- c('october', 'november','december')
colnames(allmonthsTDMmC) <- c('october', 'november','december')
# Examine
head(allmonthsTDMm)
## Docs
## Terms october november december
## aaarena 0.0001796195 0.0000000000 0.0000000000
## aahhhhs 0.0000000000 0.0000000000 0.0001837851
## aaron 0.0000000000 0.0000000000 0.0001837851
## aaro\342\200\246 0.0000000000 0.0000000000 0.0001837851
## aau 0.0000000000 0.0001891363 0.0000000000
## aba\342\200\246 0.0001796195 0.0000000000 0.0000000000
# Make comparison cloud — terms most distinctive of each month (TF-IDF)
comparison.cloud(allmonthsTDMm,
max.words=150,
random.order=FALSE,
title.size=0.8,
colors=brewer.pal(ncol(allmonthsTDMm),"Dark2"),
scale=c(3,0.5))
# Commonality cloud — terms shared across all three months (plain TF)
commonality.cloud(allmonthsTDMmC,
max.words=150,
random.order=FALSE,
colors='blue',
scale=c(3.5,0.25))